Code
devtools::install_github("f-dallolio/adtabler")
1
library(tidyverse, quietly = TRUE)
library(tsibble, quietly = TRUE)
library(rlang, quietly = TRUE)
library(glue, quietly = TRUE)
library(adtabler, quietly = TRUE)
library(furrr)First, I load the required packages.
devtools::install_github("f-dallolio/adtabler")
1
library(tidyverse, quietly = TRUE)
library(tsibble, quietly = TRUE)
library(rlang, quietly = TRUE)
library(glue, quietly = TRUE)
library(adtabler, quietly = TRUE)
library(furrr)Define the directory containing the AdIntel folders and retrieve data paths.
adintel_dir_home <- "/mnt/sata_data_1/adintel"
adintel_folder <- list.files(adintel_dir_home, full.names = TRUE) |>
stringr::str_replace_all("//", "/")
adintel_all_files_old <- list.files(adintel_dir_home,
full.names = TRUE,
recursive = TRUE
)
adintel_all_files_new <- adintel_all_files_old |>
stringr::str_replace_all(" ", "_") |>
stringr::str_replace_all("//", "/")file.rename(from = adintel_all_files_old, to = adintel_all_files_new) adintel_all_files <- list.files(adintel_dir_home,
full.names = TRUE,
recursive = TRUE
)Retrieve file paths for 2020 “dynamic” files (i.e. they can change year by year):
adintel_files_dyn <- adintel_all_files |>
str_subset(pattern = no_case("master_file"), negate = TRUE) |>
as_tibble_col("full_file_name") |>
mutate(
year = as_numeric2(str_split_i(full_file_name, "/", -3)),
file_type_data = str_split_i(full_file_name, "/", -2),
file_name_data = str_split_i(full_file_name, "/", -1),
file_type_std = rename_adintel(file_type_data),
file_name_std = rename_adintel(str_split_i(file_name_data, "\\.", 1)),
.before = full_file_name
) |>
arrange(year, file_type_data, file_name_data)Retrieve file paths for “static” files (i.e. they do not change year by year):
adintel_files_static <- adintel_all_files |>
str_subset(pattern = no_case("master_file")) |>
str_subset(pattern = no_case("Latest")) |>
str_subset(pattern = as.character(max(adintel_files_dyn$year))) |>
as_tibble_col("full_file_name") |>
mutate(
year = NA,
file_type_data = "References",
file_name_data = str_split_i(full_file_name, "/", -1),
file_type_std = rename_adintel(file_type_data),
file_name_std = rename_adintel(str_split_i(file_name_data, "\\.", 1)),
.before = full_file_name
) |>
arrange(file_type_data, file_name_data)The function takes the full path of a file (full_file_name) and returns a data frame/tibble with the the number of columns (n_cols), the original column names (col_name_data) an their positions (col_pos), an the new column names standardized with the function adtabler::rename_adintel().
nested tibble with the name of the file (file), the string used (sep) to separate columns, the number of rows (n_rows), the number of columns (n_cols), and the column names (col_name_man). I use _man at the end of the column indicating the column names in the original files to indicate that it is the name used in the Adintel _manual. The column col_name_std contains standardized column names. Standardization is performed by the function adtabler::rename_adintel().
fn_col_info <- function(file) {
x <- file
# load the first 100 rows of data
df <- data.table::fread(
file = x,
nrows = 100
)
# retrieve column names
col_name_data <- names(df)
# retrieve number of columns
n_cols <- NCOL(df)
# retrieve the position of columns in the original data
col_pos <- seq_along(col_name_data)
# standardize column names
col_name_std <- rename_adintel(col_name_data)
# return a tibble/data frame with:
# Number of columns (1),
# original column names (3) and their positions (2),
# standardized column names.
tibble(
n_cols,
col_pos,
col_name_data,
col_name_std
)
}The data frame row_numbers below is a pre-calculated table that contains the number of rows for each file. The number of rows was efficiently computed with the read_nrows function.
data("row_numbers")
data_info <- adintel_files_dyn |>
bind_rows(
adintel_files_static
) |>
mutate(
data = full_file_name |>
map(
.f = ~ fn_col_info(.x)
)
) |>
unnest(
everything()
) |>
inner_join(
row_numbers
) |>
select(
- n_lines_w_header
) |>
relocate(
n_rows, .before = n_cols
) |>
relocate(
full_file_name, .after = last_col()
)Joining with `by = join_by(full_file_name)`
data_infodata("references_columns")
references_columnsdata("occurrences_columns")
occurrences_columnsdata("lookup_datatype")
lookup_datatyperefer_occurr_info <- occurrences_columns |>
nest( unique_key_info = c(media_type_id, col_unique_key) ) |>
bind_rows(
references_columns |> nest(unique_key_info = col_unique_key)
) |>
left_join(
lookup_datatype
) |>
relocate(
datatype_r, datatype_sql, .before = sql_precision
) |>
nest(
data_type_info = c(
datatype_r, datatype_sql,
sql_precision, sql_scale,
datatype_man, datatype_std
),
manual_info = c(col_name_man, description
)
)Joining with `by = join_by(datatype_man)`
data_info <- data_info |>
nest(
data_col_original = contains("_data"),
data_file_info = c(year, n_rows, n_cols, full_file_name)
) |>
left_join(
refer_occurr_info
)Joining with `by = join_by(file_name_std, col_pos, col_name_std)`
data_infodata_info_unnested <- data_info |>
unnest(c(unique_key_info, data_type_info, manual_info)) |>
unnest(c(data_col_original, data_file_info))
data_info_unnestedrm(list = setdiff(ls(), c("data_info", "data_info_unnested")))unique_key <- data_info_unnested |>
summarise(col_unique_key =
list(unique(col_unique_key) |>
adtabler::na_rm() |>
paste0(collapse = ",")) |>
unlist(),
.by = c(file_type_std, file_name_std, media_type_id))
unique_keyunique_key_nested <- unique_key |>
nest(.by = c(file_type_std, file_name_std),
.key = "unique_key")
unique_key_nestedfile_info <- data_info |>
select(file_type_std, file_name_std, data_file_info, data_col_original) |>
unnest(everything()) |>
select(file_type_std,
file_name_std,
file_name_data,
col_name_data,
year : full_file_name) |>
nest(col_name_data = col_name_data) |>
mutate(col_name_data = col_name_data |> map(~ .x[[1]] |> paste(collapse = ",")) |> unlist())
file_info file_info_nested <- file_info |>
nest(.by = c(file_type_std, file_name_std),
.key = "col_info")
file_info_nestedcol_info <- data_info |>
select(file_type_std : col_name_std, data_type_info) |>
unnest(everything()) |>
select(file_type_std,
file_name_std,
col_pos,
col_name_std,
datatype_r : datatype_std) |>
distinct()
col_infocol_info_nested <- col_info |>
nest(.by = c(file_type_std, file_name_std),
.key = "col_info")
col_info_nestedmanual_info <- data_info |>
select(file_type_std, file_name_std, manual_info,data_type_info) |>
unnest(everything()) |>
select(file_type_std,
file_name_std,
c(contains("_man"), description)) |>
distinct() |>
relocate(col_name_man, .before = datatype_man)
manual_infomanual_info_nested <- manual_info |>
nest(.by = c(file_type_std, file_name_std),
.key = "manual_info")
manual_info_nesteddata_info_list <- list(
unique_key = unique_key,
file_info = file_info,
col_info = col_info,
manual_info = manual_info
)
data_info_list$unique_key
# A tibble: 51 × 4
file_type_std file_name_std media_type_id col_unique_key
<chr> <chr> <int> <chr>
1 occurrences cinema 27 ""
2 occurrences cinema 28 ""
3 occurrences fsi_coupon 12 "ad_date,market_code,media_type_id…
4 occurrences internet 25 ""
5 occurrences internet 26 ""
6 occurrences magazine 7 ""
7 occurrences magazine 8 ""
8 occurrences network_tv 1 ""
9 occurrences network_tv 4 ""
10 occurrences network_tv 2 ""
# ℹ 41 more rows
$file_info
# A tibble: 354 × 8
file_type_std file_name_std file_name_data year n_rows n_cols full_file_name
<chr> <chr> <chr> <int> <int> <int> <chr>
1 impressions imp_national… ImpNationalTV… 2010 1.70e7 48 /mnt/sata_dat…
2 impressions imp_national… ImpNationalTV… 2011 1.93e7 48 /mnt/sata_dat…
3 impressions imp_national… ImpNationalTV… 2012 1.99e7 48 /mnt/sata_dat…
4 impressions imp_national… ImpNationalTV… 2013 2.02e7 48 /mnt/sata_dat…
5 impressions imp_national… ImpNationalTV… 2014 2.10e7 48 /mnt/sata_dat…
6 impressions imp_national… ImpNationalTV… 2015 2.12e7 48 /mnt/sata_dat…
7 impressions imp_national… ImpNationalTV… 2016 2.12e7 48 /mnt/sata_dat…
8 impressions imp_national… ImpNationalTV… 2017 2.14e7 48 /mnt/sata_dat…
9 impressions imp_national… ImpNationalTV… 2018 2.18e7 48 /mnt/sata_dat…
10 impressions imp_national… ImpNationalTV… 2019 2.18e7 48 /mnt/sata_dat…
# ℹ 344 more rows
# ℹ 1 more variable: col_name_data <chr>
$col_info
# A tibble: 515 × 10
file_type_std file_name_std col_pos col_name_std datatype_r datatype_sql
<chr> <chr> <int> <chr> <chr> <chr>
1 impressions imp_national_tv 1 media_type_id <NA> <NA>
2 impressions imp_national_tv 2 data_stream_id <NA> <NA>
3 impressions imp_national_tv 3 distributor_id <NA> <NA>
4 impressions imp_national_tv 4 hispanic_flag <NA> <NA>
5 impressions imp_national_tv 5 impression_type <NA> <NA>
6 impressions imp_national_tv 6 nielsen_progra… <NA> <NA>
7 impressions imp_national_tv 7 telecast_number <NA> <NA>
8 impressions imp_national_tv 8 impression_date <NA> <NA>
9 impressions imp_national_tv 9 time_interval_… <NA> <NA>
10 impressions imp_national_tv 10 tvhh <NA> <NA>
# ℹ 505 more rows
# ℹ 4 more variables: sql_precision <chr>, sql_scale <chr>, datatype_man <chr>,
# datatype_std <chr>
$manual_info
# A tibble: 261 × 5
file_type_std file_name_std col_name_man datatype_man description
<chr> <chr> <chr> <chr> <chr>
1 impressions imp_national_tv <NA> <NA> <NA>
2 impressions imp_spot_radio <NA> <NA> <NA>
3 impressions imp_spot_tv <NA> <NA> <NA>
4 market_breaks imp_market_breaks <NA> <NA> <NA>
5 occurrences cinema AdDate Character Actual date of oc…
6 occurrences cinema MarketCode Character Code of Market, a…
7 occurrences cinema MediaTypeID SmallInt Code of Media Typ…
8 occurrences cinema PrimBrandCode Integer Code of Primary B…
9 occurrences cinema ScndBrandCode Integer Code of Secondary…
10 occurrences cinema TerBrandCode Integer Code of Tertiary …
# ℹ 251 more rows
data_info_nested <- tibble(
info_type = names(data_info_list),
info = data_info_list
)
data_info_nestedusethis::use_data(data_info_list, overwrite = T)✔ Setting active project to '/home/filippo/Documents/r_wd/adtabler'
✔ Saving 'data_info_list' to 'data/data_info_list.rda'
• Document your data (see 'https://r-pkgs.org/data.html')
usethis::use_data(data_info_nested, overwrite = T)✔ Saving 'data_info_nested' to 'data/data_info_nested.rda'
• Document your data (see 'https://r-pkgs.org/data.html')